import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
1.1a
# Load the first half of the churn dataset (demographics + core service columns).
path = "TelcomCustomer-Churn_1.csv"
telecom1 = pd.read_csv(path)
telecom1
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes |
| 7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No |
| 7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes |
| 7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No |
| 7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes |
7043 rows × 10 columns
1.1b
# Load the second half of the churn dataset (billing, contract and the Churn target).
path = "TelcomCustomer-Churn_2.csv"
telecom2 = pd.read_csv(path)
telecom2
| customerID | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
| 7039 | 2234-XADUH | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
| 7040 | 4801-JZAZL | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | 8361-LTMKD | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
| 7042 | 3186-AJIEK | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 12 columns
1.1c
# Join the two halves on customerID; the left join keeps every telecom1 row.
# NOTE(review): assumes customerID is unique in both frames (consistent with
# the 7043-row results shown below); a duplicate key would multiply rows —
# consider validate='one_to_one'. TODO confirm.
telecom = pd.merge(telecom1,telecom2,on = 'customerID', how='left')
telecom
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | ... | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
| 7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
| 7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
| 7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 21 columns
1.1d
# Collect and display the column names of each input frame and of the merged
# frame, to confirm the merge carried every column across.
names1 = list(telecom1.columns)
names2 = list(telecom2.columns)
names3 = list(telecom.columns)
for names in (names1, names2, names3):
    print(names)
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity'] ['customerID', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'] ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
From the above we can see that all columns are present in the merged dataframe, but the sum of the number of columns in the first and second dataframes is not equal to the number of columns in the merged dataframe, because both of them contain the same column ('customerID'), on which we merged.
1.2 a & b
# Count missing values per column (done before any dtype coercion).
telecom.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
Indicates that there were no missing values. However, this check was done before converting continuous values to float (some continuous columns still have the object datatype, so missing entries stored as blank strings are not detected here).
# Inspect dtypes; TotalCharges appears as object although its values are numeric.
telecom.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
On observing values in the dataset we can notice that SeniorCitizen, tenure, MonthlyCharges and TotalCharges are represented in numbers. But TotalCharges is of 'object' datatype. Hence, we need to convert it into 'float' as it is having continuous numeric value.
1.2b
# Coerce TotalCharges to float; entries that cannot be parsed become NaN
# via errors="coerce" instead of raising.
telecom['TotalCharges']=pd.to_numeric(telecom['TotalCharges'],errors="coerce")
telecom.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
# Re-check missing values: the unparsable TotalCharges entries now show as NaN.
telecom.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
TotalCharges has 11 missing values. As this number is negligible compared to the total number of rows, we can drop them.
# Drop the 11 rows with missing TotalCharges (negligible against 7043 rows),
# then confirm the frame is clean.
telecom = telecom.dropna()
telecom.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
Now the data is clean and doesn't have any missing values.
# Summary statistics for the three continuous features.
telecom[['tenure', 'MonthlyCharges', 'TotalCharges']].describe()
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| count | 7032.000000 | 7032.000000 | 7032.000000 |
| mean | 32.421786 | 64.798208 | 2283.300441 |
| std | 24.545260 | 30.085974 | 2266.771362 |
| min | 1.000000 | 18.250000 | 18.800000 |
| 25% | 9.000000 | 35.587500 | 401.450000 |
| 50% | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 55.000000 | 89.862500 | 3794.737500 |
| max | 72.000000 | 118.750000 | 8684.800000 |
1.2 c
# Pie chart of the churn distribution. Labels are taken from the
# value_counts index so each wedge is labelled with the category it
# actually represents: value_counts(sort=True) puts the majority class
# ('No', 5163 rows) first, so the hard-coded labels ('Yes', 'No')
# attached 'Yes' to the majority wedge.
data_size = telecom.Churn.value_counts(sort=True)
colors = ['blue', 'orange']
labels = data_size.index.tolist()
explode = (0, 0.1)  # pull the second (minority) wedge out slightly
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Percentage of Churn in dataset')
plt.show()
# Pie chart of the gender split. The original hard-coded labels
# ('Yes', 'No') are not gender categories at all — the wedges are
# Male/Female — so labels now come from the value_counts index.
data_size = telecom.gender.value_counts(sort=True)
colors = ['red', 'orange']
labels = data_size.index.tolist()
explode = (0, 0.1)  # pull the second (minority) wedge out slightly
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Percentage of males and females in dataset')
plt.show()
# Pie chart of partner status. value_counts(sort=True) puts the majority
# class ('No', 3639 rows) first, so labels must come from the index —
# the hard-coded ('Yes', 'No') labelled the 'No' wedge as 'Yes'.
data_size = telecom.Partner.value_counts(sort=True)
colors = ['red', 'orange']
labels = data_size.index.tolist()
explode = (0, 0.1)  # pull the second (minority) wedge out slightly
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Percentage of persons having partner in dataset')
plt.show()
# Pie chart of dependents. As with Partner, the majority class is 'No'
# (4933 rows) and value_counts puts it first, so the hard-coded labels
# ('Yes', 'No') swapped the wedges; use the index instead.
data_size = telecom.Dependents.value_counts(sort=True)
colors = ['blue', 'orange']
labels = data_size.index.tolist()
explode = (0, 0.1)  # pull the second (minority) wedge out slightly
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Percentage of people with dependents in dataset')
plt.show()
# Pie chart of PhoneService. Here the majority class happens to be 'Yes'
# (6352 rows), so the hard-coded labels coincidentally matched — but the
# index-based labelling is used anyway so the chart cannot silently
# mislabel if the distribution changes.
data_size = telecom.PhoneService.value_counts(sort=True)
colors = ['blue', 'orange']
labels = data_size.index.tolist()
explode = (0, 0.1)  # pull the second (minority) wedge out slightly
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Percentage of customers having PhoneService in dataset')
plt.show()
1-2d
Insights from pie-chart:-
-Only 26.6% of customers have churned out and 73.4% have stayed with the company.
-Data has almost equal number of males and females as customers.
-Almost half of the customers have partners.
-70.2% of customers do not have dependents and 29.8% have dependents.
-Approximately 10% customers do not have phoneservice and remaining 90% have phoneservice.
# Frequency table for each categorical feature (and the target), printed
# in the same order as before. OnlineBackup was not printed originally
# and is intentionally left out to keep the output unchanged.
categorical_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
]
for col in categorical_cols:
    print(telecom[col].value_counts())
Male 3549 Female 3483 Name: gender, dtype: int64 0 5890 1 1142 Name: SeniorCitizen, dtype: int64 No 3639 Yes 3393 Name: Partner, dtype: int64 No 4933 Yes 2099 Name: Dependents, dtype: int64 Yes 6352 No 680 Name: PhoneService, dtype: int64 No 3385 Yes 2967 No phone service 680 Name: MultipleLines, dtype: int64 Fiber optic 3096 DSL 2416 No 1520 Name: InternetService, dtype: int64 No 3497 Yes 2015 No internet service 1520 Name: OnlineSecurity, dtype: int64 No 3094 Yes 2418 No internet service 1520 Name: DeviceProtection, dtype: int64 No 3472 Yes 2040 No internet service 1520 Name: TechSupport, dtype: int64 No 2809 Yes 2703 No internet service 1520 Name: StreamingTV, dtype: int64 No 2781 Yes 2731 No internet service 1520 Name: StreamingMovies, dtype: int64 Month-to-month 3875 Two year 1685 One year 1472 Name: Contract, dtype: int64 Yes 4168 No 2864 Name: PaperlessBilling, dtype: int64 Electronic check 2365 Mailed check 1604 Bank transfer (automatic) 1542 Credit card (automatic) 1521 Name: PaymentMethod, dtype: int64 No 5163 Yes 1869 Name: Churn, dtype: int64
CustomerID can be dropped from the data
# customerID is a unique identifier with no predictive value; drop it in place.
telecom.drop('customerID', axis=1, inplace=True)
1-2e
# Integer-encode every column with a fresh LabelEncoder per column.
# NOTE(review): this also label-encodes the continuous columns
# (MonthlyCharges, TotalCharges become rank-like integers, visible in the
# head() output below) — confirm that is intended rather than scaling them.
telecom = telecom.apply(LabelEncoder().fit_transform)
telecom.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 142 | 74 | 0 |
| 1 | 1 | 0 | 0 | 0 | 33 | 1 | 0 | 0 | 2 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 497 | 3624 | 0 |
| 2 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 435 | 536 | 1 |
| 3 | 1 | 0 | 0 | 0 | 44 | 0 | 1 | 0 | 2 | 0 | 2 | 2 | 0 | 0 | 1 | 0 | 0 | 266 | 3570 | 0 |
| 4 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 728 | 674 | 1 |
1-2 f
# Separate features and target, then hold out 20% for testing.
# The original computed x via drop() and then pop()ed Churn out of
# `telecom` a second time; a single read of the column is enough and
# leaves `telecom` unmutated.
y = telecom['Churn']
x = telecom.drop('Churn', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
1-2 g
# Standardize features: fit the scaler on the training split only, then
# apply the SAME transformation to the test split. The original called
# fit_transform on x_test too, which re-fits the scaler on test data —
# leaking test statistics and scaling the two splits inconsistently.
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)
1.3 a
pip install xgboost
Requirement already satisfied: xgboost in c:\users\pandu\anaconda3\lib\site-packages (1.7.1) Requirement already satisfied: numpy in c:\users\pandu\anaconda3\lib\site-packages (from xgboost) (1.21.5) Requirement already satisfied: scipy in c:\users\pandu\anaconda3\lib\site-packages (from xgboost) (1.7.3) Note: you may need to restart the kernel to use updated packages.
import xgboost as xgb
from xgboost import XGBClassifier
# precision_recall_fscore_support is imported here but not used in this cell.
from sklearn.metrics import precision_recall_fscore_support as score
# Printing list of parameters in XGBoost
# Instantiate a default classifier just to display its tunable parameters.
xgboost = XGBClassifier()
xgboost.get_params()
{'objective': 'binary:logistic',
'use_label_encoder': None,
'base_score': None,
'booster': None,
'callbacks': None,
'colsample_bylevel': None,
'colsample_bynode': None,
'colsample_bytree': None,
'early_stopping_rounds': None,
'enable_categorical': False,
'eval_metric': None,
'feature_types': None,
'gamma': None,
'gpu_id': None,
'grow_policy': None,
'importance_type': None,
'interaction_constraints': None,
'learning_rate': None,
'max_bin': None,
'max_cat_threshold': None,
'max_cat_to_onehot': None,
'max_delta_step': None,
'max_depth': None,
'max_leaves': None,
'min_child_weight': None,
'missing': nan,
'monotone_constraints': None,
'n_estimators': 100,
'n_jobs': None,
'num_parallel_tree': None,
'predictor': None,
'random_state': None,
'reg_alpha': None,
'reg_lambda': None,
'sampling_method': None,
'scale_pos_weight': None,
'subsample': None,
'tree_method': None,
'validate_parameters': None,
'verbosity': None}
# Wrap the train/test splits in XGBoost's optimized DMatrix containers.
D_train = xgb.DMatrix(x_train, label=y_train)
D_test = xgb.DMatrix(x_test, label=y_test)
param = {
    'eta': 0.3,       # learning rate
    'max_depth': 3,   # shallow trees
    # NOTE(review): Churn is binary; 'multi:softprob' with num_class=3
    # runs but 'binary:logistic' would be the natural objective — confirm.
    'objective': 'multi:softprob',
    'num_class': 3}
steps = 20  # number of boosting rounds
# Training the data on xgb — on the training split ONLY. The original
# trained a second booster on D_test and overwrote `model`, so every
# later metric came from a model fitted directly on the test set.
model = xgb.train(param, D_train, steps)
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Turn the per-row class-probability matrix into hard labels (argmax over
# the class axis) and report macro precision/recall plus accuracy for
# each split.
train_probs = model.predict(D_train)
train_labels = train_probs.argmax(axis=1)
print("Precision on trained data = {}".format(precision_score(y_train, train_labels, average='macro')))
print("Recall on trained data = {}".format(recall_score(y_train, train_labels, average='macro')))
print("Accuracy on trained data = {}".format(accuracy_score(y_train, train_labels)))
test_probs = model.predict(D_test)
test_labels = test_probs.argmax(axis=1)
print("Precision on test data = {}".format(precision_score(y_test, test_labels, average='macro')))
print("Recall on test data = {}".format(recall_score(y_test, test_labels, average='macro')))
print("Accuracy on test data = {}".format(accuracy_score(y_test, test_labels)))
Precision on trained data = 0.7432563972247626 Recall on trained data = 0.70874626293265 Accuracy on trained data = 0.7968 Precision on test data = 0.7961161619264083 Recall on test data = 0.7530393222153982 Accuracy on test data = 0.8351101634683724
Performance is better on test data than trained data
# RandomSearch
# Randomized hyper-parameter search over the default XGBClassifier.
param_grid = {
    # step-size shrinkage (alias of eta)
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    # tree depth from shallow to very deep
    "max_depth": range(3, 21, 3),
    # minimum loss reduction required to make a split
    "gamma": [i/10.0 for i in range(0, 5)],
    # fraction of columns sampled per tree
    "colsample_bytree": [i/10.0 for i in range(3, 10)],
    # L1 / L2 regularization strengths
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100],
    "reg_lambda": [1e-5, 1e-2, 0.1, 1, 10, 100]}
# Optimize for recall and refit the best estimator on that same metric.
scoring = ['recall']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
# 48 random draws from the grid, each evaluated with 3-fold stratified CV.
random_search = RandomizedSearchCV(estimator=xgboost,
                                   param_distributions=param_grid,
                                   n_iter=48,
                                   scoring=scoring,
                                   refit='recall',
                                   n_jobs=-1,
                                   cv=kfold,
                                   verbose=0)
random_result = random_search.fit(x_train, y_train)
random_result
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True),
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, gpu_id=None,
grow_policy=None,
importanc...
num_parallel_tree=None,
predictor=None, random_state=None, ...),
n_iter=48, n_jobs=-1,
param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.6,
0.7, 0.8, 0.9],
'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
'learning_rate': [0.0001, 0.001, 0.01,
0.1, 1],
'max_depth': range(3, 21, 3),
'reg_alpha': [1e-05, 0.01, 0.1, 1, 10,
100],
'reg_lambda': [1e-05, 0.01, 0.1, 1, 10,
100]},
refit='recall', scoring=['recall'])
# Report the best cross-validated recall and the parameters that achieved it.
print(f'The best score is {random_result.best_score_:.4f}')
print(f'The best hyperparameters are {random_result.best_params_}')
The best score is 0.5383
The best hyperparameters are {'reg_lambda': 100, 'reg_alpha': 10, 'max_depth': 6, 'learning_rate': 1, 'gamma': 0.3, 'colsample_bytree': 0.4}
The recall score obtained with random search is not good.
# 1.3b
import pickle
# (dropped: `from sklearn.datasets import load_digits` — it was never used)

xgb_params = {
    'objective': 'binary:logistic',
    # moved here from fit(): passing eval_metric to fit() is deprecated
    # in xgboost >= 1.6
    'eval_metric': 'auc',
    'reg_lambda': 0.8,
    'reg_alpha': 0.4,
    'max_depth': 10,
    'max_delta_step': 1,
}
clf = xgb.XGBClassifier(**xgb_params)
# NOTE(review): this fits on the FULL feature matrix (x, y), including the
# rows held out as the test set above — confirm that is intended.
clf.fit(x, y, verbose=True)
# Round-trip the fitted model through pickle; the original passed bare
# open() handles to pickle and never closed them — use context managers.
with open("xgb_temp.pkl", "wb") as fh:
    pickle.dump(clf, fh)
with open("xgb_temp.pkl", "rb") as fh:
    clf2 = pickle.load(fh)
# The reloaded model must predict identically to the original.
assert np.allclose(clf.predict(x), clf2.predict(x))
print(clf2.get_xgb_params())
{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'eval_metric': None, 'gamma': 0, 'gpu_id': -1, 'grow_policy': 'depthwise', 'interaction_constraints': '', 'learning_rate': 0.300000012, 'max_bin': 256, 'max_cat_threshold': 64, 'max_cat_to_onehot': 4, 'max_delta_step': 1, 'max_depth': 10, 'max_leaves': 0, 'min_child_weight': 1, 'monotone_constraints': '()', 'n_jobs': 0, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0.4, 'reg_lambda': 0.8, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None}
# GridSearch
from sklearn.model_selection import GridSearchCV
clf = xgb.XGBClassifier()
# Exhaustive grid: 6*8*4*5*4 = 3840 combinations x 3 CV folds = 11520
# fits — this cell is very expensive to run.
parameters = {
    # 'eta' is XGBoost's native alias for learning_rate
    "eta" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
    "max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [ 1, 3, 5, 7 ],
    "gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}
# Scored on negative log-loss (higher, i.e. closer to 0, is better).
grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)
grid.fit(x_train, y_train)
GridSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None,...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, ...),
n_jobs=4,
param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
'eta': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
'min_child_weight': [1, 3, 5, 7]},
scoring='neg_log_loss')
# Re-train with manually chosen parameters.
param = {
    'eta': 0.5,
    'max_depth': 5,
    'objective': 'multi:softprob',
    # NOTE(review): the target has only 2 classes; num_class=6 spends
    # capacity on four classes that never occur — confirm intent.
    'num_class': 6}
# Train on the training DMatrix only. The original trained a second
# booster on D_test and overwrote this one, so the "after tuning" metrics
# below were produced by a model fitted directly on the test set.
model = xgb.train(param, D_train, steps)
Performance after tuning parameters
# Hard labels via argmax over the class-probability rows, then the same
# macro precision/recall and accuracy report for each split.
train_probs = model.predict(D_train)
train_labels = train_probs.argmax(axis=1)
print("Precision on trained data = {}".format(precision_score(y_train, train_labels, average='macro')))
print("Recall on trained data = {}".format(recall_score(y_train, train_labels, average='macro')))
print("Accuracy on trained data = {}".format(accuracy_score(y_train, train_labels)))
test_probs = model.predict(D_test)
test_labels = test_probs.argmax(axis=1)
print("Precision on test data = {}".format(precision_score(y_test, test_labels, average='macro')))
print("Recall on test data = {}".format(recall_score(y_test, test_labels, average='macro')))
print("Accuracy on test data = {}".format(accuracy_score(y_test, test_labels)))
Precision on trained data = 0.7252854956096846 Recall on trained data = 0.7121855270536075 Accuracy on trained data = 0.7854222222222222 Precision on test data = 0.9308501184834124 Recall on test data = 0.919942205634557 Accuracy on test data = 0.9431414356787491
There is improvement in performance of test data after tuning parameters like 'eta', 'max_depth', 'num_class'.
Precision = 93%, Recall = 91.9%, Accuracy = 94%
# Importing all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()
# As CSV file is already imported in previous step. Performing functions directly on merged dataframe
# Build a fresh merged frame for the EDA section: `telecom` above was
# already label-encoded, so re-merge the raw inputs instead of reusing it.
data_df = pd.merge(telecom1,telecom2,on = 'customerID', how='left')
data_df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
def dataoveriew(df, message):
    """Print a quick structural overview of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    message : str
        Heading printed before the summary. (The original accepted this
        parameter but silently ignored it.)
    """
    print(message)
    print('Number of rows: ', df.shape[0])
    print("Number of Columns:", df.shape[1])
    print("Column names:")
    print(df.columns.tolist())
    print("Missing values:", df.isnull().sum().values.sum())
    print("Unique values:")
    print(df.nunique())
# Print the merged frame's overview (shape, columns, missing counts, cardinalities).
dataoveriew(data_df, 'Overview of the dataset')
Number of rows: 7043 Number of Columns: 21 Column names: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'] Missing values: 0 Unique values: customerID 7043 gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 73 PhoneService 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaperlessBilling 2 PaymentMethod 4 MonthlyCharges 1585 TotalCharges 6531 Churn 2 dtype: int64
Creating a function that computes value counts, rounds them to percentages, and formats the categorical and numerical features for graph visualization.
def bar(feature, df=data_df):
    """Plot a grouped churn-vs-feature bar chart with a value-count annotation.

    NOTE(review): the default `df=data_df` binds the module-level frame at
    definition time, so later reassignments of data_df (e.g. get_dummies)
    are NOT picked up by calls that rely on the default — verify intent.
    """
    # Count rows per (feature value, Churn) pair for the grouped bars.
    temp_df = df.groupby([feature, 'Churn']).size().reset_index()
    temp_df = temp_df.rename(columns={0:'Count'})
    # Overall value counts of the feature as a two-column frame.
    value_counts_df = df[feature].value_counts().to_frame().reset_index()
    # Positional access: column 0 = category, column 1 = count.
    # NOTE(review): relies on the column layout produced by
    # value_counts().to_frame().reset_index(), whose column names changed
    # across pandas versions — confirm against the installed version.
    categories = [cat[1][0] for cat in value_counts_df.iterrows()]
    num_list = [num[1][1] for num in value_counts_df.iterrows()]
    # Convert counts to percentages rounded to one decimal place.
    div_list = [element / sum(num_list) for element in num_list]
    percentage = [round(element * 100,1) for element in div_list]
    def num_format(list_instance):
        # Join percentages as "a%, b% & c%".
        formatted_str = ''
        for index,num in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{num}%, '
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{num}% & '
            else:
                formatted_str=formatted_str+f'{num}%'
        return formatted_str
    def str_format(list_instance):
        # Join category names as "a, b & c".
        formatted_str = ''
        for index, cat in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{cat}, '
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{cat} & '
            else:
                formatted_str=formatted_str+f'{cat}'
        return formatted_str
    num_str = num_format(percentage)
    cat_str = str_format(categories)
    # Grouped bars coloured by churn status.
    fig = px.bar(temp_df, x=feature, y='Count', color='Churn', title=f'Churn rate by {feature}', barmode="group", color_discrete_sequence=["green", "red"])
    # Annotation box to the right of the plot with the distribution summary.
    fig.add_annotation(
        text=f'Value count of distribution of {cat_str} are<br>{num_str} percentage respectively.',
        align='left',
        showarrow=False,
        xref='paper',
        yref='paper',
        x=1.4,
        y=1.3,
        bordercolor='black',
        borderwidth=1)
    # Wide right margin so the annotation box is not clipped.
    fig.update_layout(
        margin=dict(r=600),
    )
    return fig.show()
Using the above function, we can visualize the features as bar graphs.
Bar graph for Demographic features:-
# Churn-rate bar charts for the demographic features.
bar('gender')
bar('Partner')
bar('Dependents')
Insights from above Bar Graph:-
-Indicates that 939 females and 930 males were churned from the company.
-1200 customers not having partner were churned, 669 customers having partner are churned. Comparatively customers not having partner were churned more than customers having partner.
-Clearly churning is less with customers having dependents.
-Churning of SeniorCitizens is less.
-There’s a higher proportion of churn in younger customers, customers with no partners, and customers with no dependents.
# Recode SeniorCitizen from 0/1 to the No/Yes strings used by the other
# binary columns (the column holds only 0 and 1), then plot its churn bars.
data_df['SeniorCitizen'] = data_df['SeniorCitizen'].map({0: 'No', 1: 'Yes'})
bar('SeniorCitizen')
Exploring Payment features:-
# Churn-rate bar charts for the payment-related features.
bar('Contract')
bar('PaperlessBilling')
bar('PaymentMethod')
Insights:-
-The shorter the contract, the higher the churn rate.
-Churn Rate is higher for the customers who opted for paperless billing. About 59.2% of customers use paperless billing.
-Customers who pay with electronic checks are more likely to churn, and this kind of payment is more common than other payment types.
Exploring other features:-
# Churn-rate bar charts for the service-related features.
bar('InternetService')
bar('OnlineSecurity')
bar('StreamingTV')
bar('StreamingMovies')
EXPLORING DATA :-
# Inspect dtypes of the fresh merged frame (TotalCharges is still object here).
data_df.dtypes
customerID object gender object SeniorCitizen object Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
On observing values in the dataset we can notice that tenure, MonthlyCharges and TotalCharges are represented in numbers. But TotalCharges is of 'object' datatype. Hence, we need to convert it into 'float' as it is having continuous numeric value.
# Coerce TotalCharges to numeric; unparsable entries become NaN.
data_df['TotalCharges'] = pd.to_numeric(data_df['TotalCharges'],errors='coerce')
DATA PREPROCESSING:-
# Creating pre-processing function for identifying missing values and impute them, dropping unnecessary feature
def preprocessing(df, message):
    """Report missing values, impute TotalCharges, and drop customerID.

    Mutates *df* in place: NaN TotalCharges are filled with the column
    median, and the customerID column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean in place. (The original ignored this parameter and
        always mutated the module-level data_df.)
    message : str
        Currently unused; kept for interface compatibility with callers.
    """
    print("Missing values:", df.isnull().sum().values.sum())
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
    df.drop(["customerID"], axis=1, inplace=True)
# Clean the EDA frame: impute TotalCharges and drop customerID in place.
preprocessing(data_df, 'preprocessing')
Missing values: 11
ENCODING CATEGORICAL FEATURES :-
def binary_map(feature):
    """Map a Yes/No Series to 1/0 (values outside the mapping become NaN)."""
    yes_no = {'Yes': 1, 'No': 0}
    return feature.map(yes_no)
# Encode the binary string columns to 0/1, one-hot encode the remaining
# multi-category columns, and build tercile bins for the numeric features.
data_df['Churn'] = binary_map(data_df['Churn'])
data_df['gender'] = data_df['gender'].map({'Male': 1, 'Female': 0})
yes_no_columns = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
data_df[yes_no_columns] = data_df[yes_no_columns].apply(binary_map)
data_df = pd.get_dummies(data_df, drop_first=True)
# Equal-frequency (tercile) bins for each continuous feature, kept in a
# separate frame alongside the target.
bin_df = pd.DataFrame()
for numeric_col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    bin_df[numeric_col + '_bins'] = pd.qcut(data_df[numeric_col], q=3, labels=['low', 'medium', 'high'])
bin_df['Churn'] = data_df['Churn']
Plotting histogram for three numeric features of dataset :-
def hist(feature):
    """Plot churn counts over *feature* as a plotly histogram with a box marginal."""
    counts = data_df.groupby([feature, 'Churn']).size().reset_index()
    counts = counts.rename(columns={0: 'Count'})
    fig = px.histogram(counts, x=feature, y='Count', color='Churn',
                       marginal='box',
                       title=f'Churn rate frequency to {feature} distribution',
                       color_discrete_sequence=["green", "red"])
    fig.show()

# Histograms for the three numeric features.
for numeric_feature in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    hist(numeric_feature)
CORRELATION :-
# Pairwise correlation heatmap of the (now fully numeric) encoded features.
corr = data_df.corr()
fig = px.imshow(corr,width=1000, height=1000)
fig.show()
# Generalized Linear Model Regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
all_columns = [column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_") for column in data_df.columns]
data_df.columns = all_columns
glm_columns = [e for e in all_columns if e not in ['customerID', 'Churn']]
glm_columns = ' + '.join(map(str, glm_columns))
glm_model = smf.glm(formula=f'Churn ~ {glm_columns}', data=data_df, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Churn No. Observations: 7043
Model: GLM Df Residuals: 7019
Model Family: Binomial Df Model: 23
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2914.7
Date: Sat, 10 Dec 2022 Deviance: 5829.3
Time: 19:40:06 Pearson chi2: 8.04e+03
No. Iterations: 7 Pseudo R-squ. (CS): 0.2807
Covariance Type: nonrobust
=========================================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
Intercept 0.8274 0.748 1.106 0.269 -0.639 2.294
gender -0.0219 0.065 -0.338 0.736 -0.149 0.105
SeniorCitizen 0.2151 0.085 2.545 0.011 0.049 0.381
Partner -0.0027 0.078 -0.035 0.972 -0.155 0.150
Dependents -0.1538 0.090 -1.714 0.087 -0.330 0.022
tenure -0.0594 0.006 -9.649 0.000 -0.071 -0.047
PhoneService 0.5036 0.692 0.728 0.467 -0.852 1.860
PaperlessBilling 0.3418 0.074 4.590 0.000 0.196 0.488
MonthlyCharges -0.0404 0.032 -1.272 0.203 -0.103 0.022
TotalCharges 0.0003 7.01e-05 4.543 0.000 0.000 0.000
MultipleLines_No_phone_service 0.3238 0.106 3.061 0.002 0.116 0.531
MultipleLines_Yes 0.4469 0.177 2.524 0.012 0.100 0.794
InternetService_Fiber_optic 1.7530 0.798 2.198 0.028 0.190 3.316
InternetService_No -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineSecurity_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineSecurity_Yes -0.2055 0.179 -1.150 0.250 -0.556 0.145
OnlineBackup_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineBackup_Yes 0.0258 0.175 0.147 0.883 -0.318 0.369
DeviceProtection_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
DeviceProtection_Yes 0.1477 0.176 0.838 0.402 -0.198 0.493
TechSupport_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
TechSupport_Yes -0.1789 0.180 -0.991 0.322 -0.533 0.175
StreamingTV_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
StreamingTV_Yes 0.5912 0.326 1.813 0.070 -0.048 1.230
StreamingMovies_No_internet_service -0.2559 0.115 -2.220 0.026 -0.482 -0.030
StreamingMovies_Yes 0.6038 0.326 1.850 0.064 -0.036 1.244
Contract_One_year -0.6671 0.107 -6.208 0.000 -0.878 -0.456
Contract_Two_year -1.3896 0.176 -7.904 0.000 -1.734 -1.045
PaymentMethod_Credit_card__automatic_ -0.0865 0.114 -0.758 0.448 -0.310 0.137
PaymentMethod_Electronic_check 0.3057 0.094 3.236 0.001 0.121 0.491
PaymentMethod_Mailed_check -0.0567 0.115 -0.493 0.622 -0.282 0.168
=========================================================================================================
Feature importance :-
# Odds ratios: exponentiating the GLM coefficients gives each feature's
# multiplicative effect on the odds of churn (values > 1 raise churn odds).
np.exp(res.params)
Intercept 2.287343 gender 0.978355 SeniorCitizen 1.239957 Partner 0.997312 Dependents 0.857471 tenure 0.942322 PhoneService 1.654668 PaperlessBilling 1.407543 MonthlyCharges 0.960432 TotalCharges 1.000318 MultipleLines_No_phone_service 1.382358 MultipleLines_Yes 1.563475 InternetService_Fiber_optic 5.771657 InternetService_No 0.774257 OnlineSecurity_No_internet_service 0.774257 OnlineSecurity_Yes 0.814269 OnlineBackup_No_internet_service 0.774257 OnlineBackup_Yes 1.026127 DeviceProtection_No_internet_service 0.774257 DeviceProtection_Yes 1.159152 TechSupport_No_internet_service 0.774257 TechSupport_Yes 0.836193 StreamingTV_No_internet_service 0.774257 StreamingTV_Yes 1.806134 StreamingMovies_No_internet_service 0.774257 StreamingMovies_Yes 1.829067 Contract_One_year 0.513185 Contract_Two_year 0.249179 PaymentMethod_Credit_card__automatic_ 0.917142 PaymentMethod_Electronic_check 1.357617 PaymentMethod_Mailed_check 0.944913 dtype: float64
from sklearn.preprocessing import MinMaxScaler

# Rescale the continuous features to [0, 1]. Min-max scaling is applied
# independently per column, so one transform over all three columns is
# equivalent to three separate fit_transform calls.
sc = MinMaxScaler()
continuous_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
data_df[continuous_cols] = sc.fit_transform(data_df[continuous_cols])
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def splitting(df, message):
    """Split *df* into features/target and 80/20 train/test partitions.

    The previous version ignored its *df* parameter (it read the global
    `data_df`) and assigned the splits to local variables that were discarded
    on return; the splits are now returned so callers can bind them.

    Parameters
    ----------
    df : pd.DataFrame
        Fully pre-processed frame containing a binary 'Churn' column.
    message : str
        Step label (kept for interface compatibility; currently unused).

    Returns
    -------
    tuple
        (X, y, X_train, X_test, y_train, y_test)
    """
    X = df.drop('Churn', axis=1)
    y = df['Churn']
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
    return X, y, X_train, X_test, y_train, y_test

X, y, X_train, X_test, y_train, y_test = splitting(data_df, 'splitting')
# Generic model runner: instantiate, fit, predict, and report test metrics.
def modeling(alg, alg_name, params=None):
    """Fit classifier *alg* on the global train split and print test scores.

    Parameters
    ----------
    alg : type
        Classifier class exposing fit/predict (e.g. LogisticRegression).
    alg_name : str
        Human-readable name printed above the scores.
    params : dict, optional
        Keyword arguments forwarded to the classifier constructor.
        Default of None (instead of a mutable `{}` default) avoids the
        shared-mutable-default pitfall.

    Returns
    -------
    The fitted classifier instance.
    """
    model = alg(**(params or {}))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The nested print_scores helper added no value (its `alg` parameter was
    # unused and it was re-defined on every call) -- report inline instead.
    print(alg_name)
    print("accuracy: ", accuracy_score(y_test, y_pred))
    print("precision: ", precision_score(y_test, y_pred))
    print("recall: ", recall_score(y_test, y_pred))
    print("f1_score: ", f1_score(y_test, y_pred, average='weighted'))
    return model
# Fit and score a baseline logistic-regression model on the test split.
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression accuracy: 0.7983909133932797 precision: 0.6281800391389433 recall: 0.5763016157989228 f1_score: 0.7955174819104321
# Feature selection: recursive feature elimination with 10-fold stratified CV.
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

log = LogisticRegression()
folds = StratifiedKFold(n_splits=10, random_state=50, shuffle=True)
rfecv = RFECV(estimator=log, cv=folds, scoring="accuracy")
rfecv.fit(X, y)
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
estimator=LogisticRegression(), scoring='accuracy')
# Plot mean CV accuracy against the number of features RFE retained.
# NOTE(review): `grid_scores_` was removed in scikit-learn >= 1.2; newer
# versions expose the same data as rfecv.cv_results_['mean_test_score'].
plt.figure(figsize=(8, 6))
n_steps = len(rfecv.grid_scores_)
plt.plot(range(1, n_steps + 1), rfecv.grid_scores_)
plt.grid()
plt.xticks(range(1, X.shape[1] + 1))
plt.xlabel("Number of Selected Features")
plt.ylabel("CV Score")
plt.title("Recursive Feature Elimination (RFE)")
plt.show()

print("The optimal number of features: {}".format(rfecv.n_features_))
The optimal number of features: 23
Trying other machine learning algorithms :-
# Re-run each candidate model through the shared `modeling` helper.
# Baseline: logistic regression.
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression accuracy: 0.7983909133932797 precision: 0.6281800391389433 recall: 0.5763016157989228 f1_score: 0.7955174819104321
# Support-vector classifier (default RBF kernel).
svc_model = modeling(SVC, 'SVC Classification')
SVC Classification accuracy: 0.795551348793185 precision: 0.6355748373101953 recall: 0.526032315978456 f1_score: 0.7889704158679894
# Random-forest ensemble.
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")
Random Forest Classification accuracy: 0.780407004259347 precision: 0.5995717344753747 recall: 0.5026929982046678 f1_score: 0.7738430191302584
# Single decision tree.
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")
Decision Tree Classification accuracy: 0.7292948414576431 precision: 0.48776508972267535 recall: 0.5368043087971275 f1_score: 0.7332929726420037
# Gaussian naive Bayes.
nb_model = modeling(GaussianNB, "Naive Bayes Classification")
Naive Bayes Classification accuracy: 0.6469474680548982 precision: 0.42011834319526625 recall: 0.8922800718132855 f1_score: 0.6660052398768987
Among all models, Logistic Regression shows the best accuracy, followed by SVC, Random Forest, Decision Tree, and Naive Bayes, respectively.
# 'hist' and 'bar' were created for graphical visualization of certain features
Summary of all 'def' functions with which we can simply call a model by its name :-
dataoveriew # Overview of data
bar # Formatting string and numerical values
preprocessing # Preparation of data for performing models
binary_map # Encoding variables
splitting # Splitting data into 'x' and 'y'
model = modeling(LogisticRegression, 'Logistic Regression') # Re-fit the best-performing model (logistic regression)
Logistic Regression accuracy: 0.7983909133932797 precision: 0.6281800391389433 recall: 0.5763016157989228 f1_score: 0.7955174819104321